/*
This DO file illustrates the logic used to create the final water use panel from
raw and intermediate input files received from the water authority
*/
clear
set more off

* read in and modify xeriscape enrollment data
use "$input/1tblWSLEnroll.dta", clear 

* create variable indicating the number of conversions by a parcel
duplicates tag PARCEL, gen(conversions)
replace conversions = conversions + 1
la var conversions "number of conversions for a given parcel"

* create indicator variable for duplicate conversions
gen single = 0
replace single = 1 if conversions == 1
la var single "indicator for parcels converting only once"

* Analyze (and eventually drop) duplicate observations
sort PARCEL ENROLL_DATE

duplicates report PARCEL
duplicates report PARCEL WG // no duplicates across classification

gen year = year(ENROLL_DATE)
duplicates report PARCEL // how many parcels converted more than once?
duplicates report PARCEL year // how many converted more than once in a year?
drop year

duplicates drop PARCEL, force

* I drop consumption post 5/2014, thus any conversion after 5/2013 won't have
* +12 mos of data
gen postApril2013 = 0 
replace postApril2013 = 1 if ENROLL_DATE >= date("5/1/2013","MDY")
tab postApril2013
drop postApril2013

sort PARCEL
la data "temporary enrollment data file, PARCEL as unique id: for event study"
save "$intermediate/enrollment_TEMP.dta", replace

* ______________________________________________________________________________
* ______________________________________________________________________________
* merge consumption data with service to parcel matching data
use "$intermediate/1tblSFR_PclSvc_1to1.dta", clear // the 1:1 data I created 


use "$input/1tblSFRCons_long.dta", clear // water consumption dataset
rename cons wuse
describe

summarize wuse
summarize wuse if wuse == 0
summarize wuse if wuse < 0
summarize wuse if wuse > 0

* according to email from SNWA, set negative values to 0
replace wuse = 0 if wuse < 0

drop if year == 2014 & month >= 5 // data is incomplete data beginning 5/2014.

sort SVC_NBR year month
merge m:1 SVC_NBR using "$intermediate/1tblSFR_PclSvc_1to1.dta"
display (492500 + 1473)/_N * 100 " % of observations not matched"

* unmatched from master (1): no parcel associated with service number
* unmatched from using (2): no consumption records associated with service no
keep if _merge == 3
drop _merge

order SVC_NBR PARCEL year
sort PARCEL year month
la da "water consumption data matched to PARCEL id"
save "$intermediate/consumptionPanel.dta", replace
describe

* ______________________________________________________________________________
* ______________________________________________________________________________
* merge consumption data with xeriscape enrollment data
use "$intermediate/enrollment_TEMP.dta", clear

sort PARCEL
merge 1:m PARCEL using "$intermediate/consumptionPanel.dta"

* unmatched from master (1): participant is not in the consumption data
* unmatched from using (2): parcel is a non-participant
* matched (3): parcel is a program participant

replace conversions = 0 if _merge == 2
replace single = 0 if _merge == 2

gen enrollee = 0
replace enrollee = 1 if conversions > 0

drop if _merge==1 // drop participants with no consumption data
drop _merge

* The event year and month is relevant for single converters only
gen event_year = year(ENROLL_DATE) if single==1
gen event_month = month(ENROLL_DATE) if single==1

* create tau variable for event study
gen tau = 12*(year-event_year) + month - event_month if single == 1

* generate a numeric parcel ID variable
egen pid = group(PARCEL)
la var pid "numeric parcel id: group(PARCEL)"
sort pid

* save event study panel
compress
sort PARCEL year month
la da "Event study panel, including participants and non-participants"
save "$intermediate/eventStudyPanel_raw.dta", replace

rm "$intermediate/consumptionPanel.dta"

*_______________________________________________________________________________
* investigate how balanced is the data sample
use "$intermediate/eventStudyPanel_raw.dta", clear

* generate a count 1 to n for each parcel beginning in the first month of record
by PARCEL: gen test = _n // note: minimum count must necessarily = 1
by PARCEL: egen maxtest = max(test) // maximum count

* when do parcels exit the sample? (most should be april 2014)
  tab year month if test==maxtest

* when do parcels exit (net of those exiting in 4/14)?
  tab year month if test==maxtest & (year!=2014 & month!=4)

* when do parcels exit (by enrollee and net of 4/14ers)?
  tab enrollee if test==maxtest & (year!=2014 & month!=4)

* when do parcels exit (year by enrollee)?
  tab year enrollee if test==maxtest

* when do parcels exit (year by enrollee; net of 4/14ers)?
  tab year enrollee if test==maxtest & (year!=2014 & month!=4)

* when do parcels exit (year by enrollee; SF enrollees only)?
  tab year enrollee if test==maxtest & (enrollee==0|(enrollee==1 & WG=="SF"))

* when do parcels come online (by year and enrolee; SF enrollees only)?
  tab year enrollee if test==1 & (enrollee==0|(enrollee==1 & WG=="SF"))

* when do parcels come online (gives percent of total in each year)?
  tab year if test==1 & (enrollee==0|(enrollee==1 & WG=="SF"))

* when do non-participants come online (gives percent of total in each year)?
  tab year if test==1 & (enrollee==0)
  
* when do participants come online (gives percent of total in each year)?
  tab year if test==1 & (enrollee==1 & WG=="SF")

* from the first tabulation, I learn that any parcel exiting the sample in 2014
*  exits in april, meaning that these parcels run through to end of sample
sort PARCEL year month
by PARCEL: egen maxyear = max(year)

gen earlyx = 0
replace earlyx = 1 if maxyear<2014
la var earlyx "indicator for a parcel that exited sample before 4/14"

drop test maxtest maxyear
* save event study panel
compress
sort PARCEL year month
la da "Event study panel, including participants and non-participants"
save "$intermediate/eventStudyPanel_raw.dta", replace

*_______________________________________________________________________________
* define indicators to construct a balanced sample

use "$intermediate/eventStudyPanel_raw.dta", clear

* create a flag for parcels that are in the sample from the beginning
sort PARCEL year month
by PARCEL: egen minyear = min(year)

gen balance88 = 0
replace balance88 = 1 if minyear==1988 & earlyx == 0
la var balance88 "balanced sample: parcels from jan1 1988 to april 2014"

gen balance96 = 0
replace balance96 = 1 if minyear<=1996 & earlyx == 0
la var balance96 "parcels from/before jan1 1996 to april 2014"

drop minyear

* save event study panel
compress
sort PARCEL year month
la da "Event study panel, including participants and non-participants"
save "$intermediate/eventStudyPanel_raw.dta", replace


*_______________________________________________________________________________
* define indicators to construct a balanced sample with no empty/missing 
*  consumption values

use "$intermediate/eventStudyPanel_raw.dta", clear

by PARCEL: gen parcelobs = _n
by PARCEL: egen maxparcelobs = max(parcelobs)
tab maxparcelobs balance88

// 1988 to 2013 = 26 years x 12 + 4 months = 316
// Parcels that are continuously in the sample will have maxparcelobs = 316
gen balanced = 0
replace balanced = 1 if maxparcelobs == 316
la var balanced "parcels that run continuously from Jan. '88 to Apr. '14"

drop parcelobs maxparcelobs

* save event study panel
compress
sort PARCEL year month
la da "Event study panel, including participants and non-participants"
save "$intermediate/eventStudyPanel_raw.dta", replace

*_______________________________________________________________________________
* define some of the data used for summary stats in the raw panel
* (saves me from having to recreate it each time I define a new sample from 
*  the raw version)

use "$intermediate/eventStudyPanel_raw.dta", clear
rename _all, lower
drop single enrollee

* define new enrollement variable; defines enrollee pre-post enrollment
gen eventyearmonth = event_year + (event_month - 1)/12
gen yearmonth = year + (month - 1)/12

gen enroll2 = 0
replace enroll2 = 0 if conversions == 0
replace enroll2 = 1 if conversions >= 1 & yearmonth < eventyearmonth
replace enroll2 = 2 if conversions >= 1 & yearmonth >= eventyearmonth

la var enroll2 "enrollment by participation"
la define enroll2_defn 0 "Non-participant" 1 "Participant pre-conversion" ///
 2 "Participant post-conversion"
la define year_defn ///
1988 "1988" 1989 "1989" 1990 "1990" 1991 "1991" 1992 "1992" ///
1993 "1993" 1994 "1994" 1995 "1995" 1996 "1996" 1997 "1997" 1998 "1998" ///
1999 "1999" 2000 "2000" 2001 "2001" 2002 "2002" 2003 "2003" 2004 "2004" ///
2005 "2005" 2006 "2006" 2007 "2007" 2008 "2008" 2009 "2009" 2010 "2010" ///
2011 "2011" 2012 "2012" 2013 "2013" 2014 "2014" 

la values enroll2 enroll2_defn
la values year year_defn
drop eventyearmonth

* define xeriscape indicator
gen chi = 0
replace chi = 1 if tau >= 0 & tau !=.

* define post conversion time trend-by-xeriscape interaction
gen int chiBYtau = 0
replace chiBYtau = chi * tau if tau >= 0 & tau!=.

gen chiBYtau2 = 0
replace chiBYtau2 = chi * tau^2 if tau >= 0 & tau!=.

* define annual adopter indicators 
foreach yr of numlist 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 {
gen dummy`yr' = 0
replace dummy`yr' = 1 if event_year == `yr'
gen chiBY`yr' = chi * dummy`yr'
}

* define interaction with min conversion (march 2004)
gen minconv = 0
replace minconv = 1 if enroll_date >= date("3/1/2004","DMY") & enroll_date!=.
gen chiBYminconv = chi * minconv
la var minconv "take on 1 if enrollment is on/after march 1, 2004"

* define interaction with change from incentive to 
* fixed price per sqft on July 1, 2000
gen ppsf = 0
replace ppsf = 1 if enroll_date >= date("7/1/2000","DMY") & enroll_date!=.
gen chiBYppsf  = chi * ppsf
la var ppsf "take on 1 if enrollment is on/after July 1, 2000"

tab conversions minconv
tab conversions ppsf

* generate cohorts (by first year in sample)
bys parcel: egen int cohort = min(year) // annual cohort

gen byte cohort2 = 0
replace cohort2 = 1 if inrange(cohort, 1988, 1989)
replace cohort2 = 2 if inrange(cohort, 1990, 1994)
replace cohort2 = 3 if inrange(cohort, 1995, 1999)
replace cohort2 = 4 if inrange(cohort, 2000, 2004)
replace cohort2 = 5 if inrange(cohort, 2005, 2014)
tab cohort
tab cohort2
tab cohort cohort2
drop cohort

* reduce the size of the file, sort and save panel
compress
sort parcel year month
la da "Event study panel, including participants and non-participants"
save "$intermediate/eventStudyPanel_raw.dta", replace


*_______________________________________________________________________________
* Merge in lot size and define water consumption by lot size variable
use "$intermediate/eventStudyPanel_raw.dta", clear

// get rid of a few of these variables; this dataset is really big
drop dummy* balance88 balance96

* merge lot size data from county assessor
sort parcel year month
merge m:1 parcel using "$intermediate/ao_extract_land.dta"
sort parcel year month

preserve
duplicates drop parcel, force
tab _merge
list parcel conversions enroll_date if _merge == 1
restore

drop if _merge == 2
drop _merge

gen lotsize = landsf/1000
la var lotsize "plot size in 1000 sq-ft"

gen wuse_norm = wuse/lotsize
la var wuse_norm "monthly water consumption normalized by lot size (gal/sq-ft)

tab landuse

* reduce the size of the file (this is a really big data set), sort and save panel
compress
sort parcel year month
la da "Event study panel, including participants and non-participants"
save "$intermediate/eventStudyPanel_raw.dta", replace